library(DESeq2)
## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.3
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(tximport)
library(affy)
library(ggplot2)
library(vsn)
library(ggrepel)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
library(genefilter)
##
## Attaching package: 'genefilter'
## The following objects are masked from 'package:MatrixGenerics':
##
## rowSds, rowVars
## The following objects are masked from 'package:matrixStats':
##
## rowSds, rowVars
library(reshape2)
library(RColorBrewer)
library(pheatmap)
library(EnhancedVolcano)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(viridis)
## Loading required package: viridisLite
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:matrixStats':
##
## count
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tracktables)
library(GenomicFeatures)
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
library(ChIPseeker)
##
## Registered S3 method overwritten by 'ggtree':
## method from
## identify.gg ggfun
## ChIPseeker v1.30.3 For help: https://guangchuangyu.github.io/software/ChIPseeker
##
## If you use ChIPseeker in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Qing-Yu He. ChIPseeker: an R/Bioconductor package for ChIP peak annotation, comparison and visualization. Bioinformatics 2015, 31(14):2382-2383
library(ggbio)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Need specific help about ggbio? try mailing
## the maintainer or visit https://lawremi.github.io/ggbio/
##
## Attaching package: 'ggbio'
## The following objects are masked from 'package:ggplot2':
##
## geom_bar, geom_rect, geom_segment, ggsave, stat_bin, stat_identity,
## xlim
library(readxl)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:ChIPseeker':
##
## .
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:IRanges':
##
## desc
## The following object is masked from 'package:S4Vectors':
##
## rename
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.10.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## ! pheatmap() has been masked by ComplexHeatmap::pheatmap(). Most of the arguments
## in the original pheatmap() are identically supported in the new function. You
## can still use the original function by explicitly calling pheatmap::pheatmap().
##
## Attaching package: 'ComplexHeatmap'
## The following object is masked from 'package:pheatmap':
##
## pheatmap
## The following object is masked from 'package:genefilter':
##
## dist2
library(clusterProfiler)
## clusterProfiler v4.2.2 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
library(enrichplot)
library(mgsa)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
##
## space
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
colors <- viridis(5)[c(1, 5)]
Loading count table
cnts <- readRDS("/Volumes/GoogleDrive/My Drive/Greally_Lab/Toxoplasma/Toxoplasma/ATAC-Seq/Quantification/Infected_optimal_peaks.counts.rds")
sampleTable <- data.frame(condition = factor(c(rep("Infected",
3), rep("Uninfected", 3))), row.names = colnames(cnts))
dds <- DESeqDataSetFromMatrix(countData = assay(cnts), colData = sampleTable,
design = ~condition, rowRanges = rowRanges(cnts))
# total counts
barplot(colSums(counts(dds)), horiz = TRUE, col = colors[colData(dds)$condition],
las = 1, xlab = "Counts", main = "Total Counts")
legend("topright", legend = levels(colData(dds)$condition), lwd = 1,
col = colors)
# Unique regions
barplot(colSums(counts(dds) > 0), horiz = TRUE, col = colors[colData(dds)$condition],
las = 1, xlab = "Unique Regions", main = "Regions detected")
legend("topright", legend = levels(colData(dds)$condition), lwd = 1,
col = colors)
# plotting an unnormalized density plot of the log
# transformed counts
plotDensity(log2(counts(dds) + 1), lty = 1, col = colors[colData(dds)$condition],
lwd = 1, xlab = "log2(Counts + 1)", main = "Raw Counts")
legend("topright", legend = levels(colData(dds)$condition), lwd = 1,
col = colors)
# plotting an unnormalized box plot of the log transformed
# counts
boxplot(log2(counts(dds) + 1), col = colors[colData(dds)$condition],
cex.axis = 0.5, las = 1, horizontal = TRUE, xlab = "log2(Counts + 1)",
ylab = "Samples", main = "Raw Counts")
legend("topright", legend = levels(colData(dds)$condition), lwd = 1,
col = colors)
Remove uninfected since there are no counts
sampleTable <- data.frame(condition = factor(c(rep("Infected",
3))), row.names = colnames(cnts)[1:3])
dds <- DESeqDataSetFromMatrix(countData = assay(cnts)[, 1:3],
colData = sampleTable, design = ~1, rowRanges = rowRanges(cnts)[,
1:3])
# Run DESeq
dds <- DESeq(dds)
## Warning in DESeq(dds): the design is ~ 1 (just an intercept). is this intended?
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
# total reads
barplot(colSums(counts(dds)), horiz = TRUE, col = colors[colData(dds)$condition],
las = 1, xlab = "Fragments", ylab = "Samples", main = "Total Fragments")
# density plot normalized
plotDensity(log2(counts(dds, normalized = TRUE) + 1), lty = 1,
col = colors[colData(dds)$condition], lwd = 1, xlab = "log2(Counts + 1)",
main = "Normalized Counts")
legend("topright", legend = levels(colData(dds)$condition), lwd = 1,
col = colors)
# box plot normalized
boxplot(log2(counts(dds, normalized = TRUE) + 1), col = colors[colData(dds)$condition],
cex.axis = 0.5, las = 1, horizontal = TRUE, xlab = "log2(Counts + 1)",
ylab = "Samples", main = "Normalized Counts")
legend("topright", legend = levels(colData(dds)$condition), lwd = 1,
col = colors)
## Computing mean and variance
norm.counts <- counts(dds, normalized = TRUE)
mean.counts <- rowMeans(norm.counts)
variance.counts <- apply(norm.counts, MARGIN = 1, var)
## Mean and variance relationship
mean.var.col <- densCols(x = log2(mean.counts), y = log2(variance.counts))
plot(x = log2(mean.counts), y = log2(variance.counts), pch = 16,
cex = 0.5, col = mean.var.col, main = "Mean-variance relationship",
xlab = "Mean log2(normalized counts) per gene", ylab = "Variance of log2(normalized counts)",
panel.first = grid())
abline(a = 1, b = 1, col = "red") # a linear line to compare against
##Estimation of Dispersion
plotDispEsts(dds)
sum(mcols(dds, use.names = TRUE)[, "dispOutlier"])
## [1] 27
dds_rlog <- rlog(dds, blind = FALSE)
dds_vst <- vst(dds, blind = FALSE)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
Log2 transformed counts just for comparison. It looks like this does a good enough job.
dds_log2 <- log2(counts(dds) + 1)
meanSdPlot(dds_log2)
## Warning: Computation failed in `stat_binhex()`:
Compare rlog and vst transformations
meanSdPlot(assay(dds_rlog))
## Warning: Computation failed in `stat_binhex()`:
meanSdPlot(assay(dds_vst))
## Warning: Computation failed in `stat_binhex()`:
sampleDists <- dist(t(assay(dds_vst)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(colnames(dds_vst))
colnames(sampleDistMatrix) <- paste(colnames(dds_vst))
colors <- magma(255)
pheatmap(sampleDistMatrix, clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists, col = colors)
require(corrplot)
## Loading required package: corrplot
## corrplot 0.92 loaded
library("PerformanceAnalytics")
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following object is masked from 'package:S4Vectors':
##
## first
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
##
## textplot
## The following object is masked from 'package:graphics':
##
## legend
res <- cor(assay(dds_vst))
corrplot(res, type = "upper", order = "hclust", tl.col = "black",
tl.srt = 45)
chart.Correlation(assay(dds_vst), histogram = TRUE, pch = 19)
Review results in IGV
dir = "/Volumes/GoogleDrive/My Drive/Greally_Lab/Toxoplasma/Toxoplasma/ATAC-Seq/Differential Accessiblity/"
res.granges <- rowRanges(dds)
res.granges <- cbind(as.data.frame(res.granges@ranges), counts(dds,
normalized = TRUE)) # add counts
res.granges <- res.granges[order(rowRanges(dds)$baseMean, decreasing = TRUE),
]
makebedtable(res.granges, "res.granges.html", dir)
## [1] "/Volumes/GoogleDrive/My Drive/Greally_Lab/Toxoplasma/Toxoplasma/ATAC-Seq/Differential Accessiblity//res.granges.html"
library(ChIPseeker)
# custom annoation
chrominfo <- read.table("/Volumes/GoogleDrive/My Drive/Greally_Lab/Toxoplasma/Genomes/ME49/GCF_000006565.2_TGA4_genomic.chrom_info.txt",
header = TRUE)
TxDb.TGA4 <- makeTxDbFromGFF("/Volumes/GoogleDrive/My Drive/Greally_Lab/Toxoplasma/Genomes/ME49/GCF_000006565.2_TGA4_genomic.gff",
format = "gff", organism = "Toxoplasma", taxonomyId = 1208370,
dbxrefTag = "locus_tag", chrominfo = chrominfo)
## Import genomic features from the file as a GRanges object ... OK
## Prepare the 'metadata' data frame ... OK
## Make the TxDb object ...
## Warning in .extract_transcripts_from_GRanges(tx_IDX, gr, mcols0$type, mcols0$ID, : some transcripts have no "transcript_id" attribute ==> their name
## ("tx_name" column in the TxDb object) was set to NA
## Warning in .extract_transcripts_from_GRanges(tx_IDX, gr, mcols0$type, mcols0$ID, : the transcript names ("tx_name" column in the TxDb object) imported
## from the "transcript_id" attribute are not unique
## OK
customAnnotation <- list(version = "custom", TGA4 = transcripts(TxDb.TGA4))
# annotate
res.granges.annot <- rowRanges(dds)
res.granges.annot@elementMetadata <- cbind(res.granges.annot@elementMetadata,
assay(dds_vst))
res.granges.annot <- annotatePeak(res.granges.annot, TxDb = TxDb.TGA4,
level = "gene")
## >> preparing features information... 2022-04-20 13:23:18
## >> identifying nearest features... 2022-04-20 13:23:18
## >> calculating distance from peak to TSS... 2022-04-20 13:23:18
## >> assigning genomic annotation... 2022-04-20 13:23:18
## >> assigning chromosome lengths 2022-04-20 13:23:20
## >> done... 2022-04-20 13:23:20
# plot
plotAnnoPie(res.granges.annot)
plotDistToTSS(res.granges.annot, title = "Distribution of transcription factor-binding loci\nrelative to TSS")
res.granges.annot <- as.GRanges(res.granges.annot)
res.granges.annot <- res.granges.annot[abs(res.granges.annot$distanceToTSS) <
5000]
# write the results
write.csv(res.granges.annot, paste(dir, "Infected_DESeq2_annotated_chipseeker_ATAC.csv",
sep = ""))
output <- res.granges.annot[, 22:32]
names(output) <- NULL
makebedtable(output, "res.granges.annotate.chipseeker.html",
dir)
## [1] "/Volumes/GoogleDrive/My Drive/Greally_Lab/Toxoplasma/Toxoplasma/ATAC-Seq/Differential Accessiblity//res.granges.annotate.chipseeker.html"
Combine annotated and non-annotated
res.granges.merge <- merge(rowRanges(dds), res.granges.annot)
# add meta data
tga4_map <- read.table("/Volumes/GoogleDrive/My Drive/Greally_Lab/Toxoplasma/Genomes/ME49/ID-description_mapping.txt",
sep = "\t")
mcols(res.granges.merge)$description <- tga4_map[match(res.granges.merge$geneId,
tga4_map$V1), ]$V2
Plot peaks over chromosomes
chrs <- c("NC_031467.1", "NC_031468.1", "NC_031469.1", "NC_031470.1",
"NC_031471.1", "NC_031472.1", "NC_031473.1", "NC_031474.1",
"NC_031475.1", "NC_031476.1", "NC_031477.1", "NC_031478.1",
"NC_031479.1", "NC_031480.1")
map <- read_excel("/Volumes/GoogleDrive/My Drive/Greally_Lab/Toxoplasma/Genomes/ME49/NCBI_chromosome_mapping.xlsx")
dds.granges.plot <- res.granges.merge
dds.granges.plot <- keepSeqlevels(dds.granges.plot, pruning.mode = "tidy",
chrs)
seqlevels(dds.granges.plot) <- map$`Molecule name`[-c(15:16)] # leave out unmappedContig and apicoplast
# only show top 10 labels
dds.granges.plot <- dds.granges.plot[order(dds.granges.plot$baseMean,
decreasing = TRUE), ]
labels <- dds.granges.plot$description
dds.granges.plot$description <- ""
dds.granges.plot$description[1:10] <- labels[1:10]
plotGrandLinear(dds.granges.plot, aes(y = log2(baseMean)), color = c("red",
"blue"), ylab = "Log2(mean accessibility)", spaceline = TRUE,
xlim = c(0, 7e+07)) + theme(axis.text.x = element_text(angle = 45,
hjust = 1), text = element_text(size = 15, family = "Arial")) +
geom_text_repel(aes(label = dds.granges.plot$description),
color = "black", max.overlaps = Inf, size = 5)
## using coord:genome to parse x scale
## Warning: Removed 2 rows containing missing values (geom_text_repel).
autoplot(dds.granges.plot, layout = "karyogram", aes(color = log2(baseMean),
fill = log2(baseMean))) + scale_color_gradient(low = "blue",
high = "red") + scale_fill_gradient(low = "blue", high = "red")
## Warning in getIdeoGR(data): geom(ideogram) need valid seqlengths information for accurate mapping,
## now use reduced information as ideogram...
## Warning in getIdeoGR(data): geom(ideogram) need valid seqlengths information for accurate mapping,
## now use reduced information as ideogram...
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
Heatmap
dds_vst.mat <- as.matrix(res.granges.annot@elementMetadata[,
c("I1", "I2", "I3")])
# divide into 3 clusters
clust <- kmeans(dds_vst.mat, 3)
split <- factor(paste0("Cluster\n", clust$cluster), levels = c("Cluster\n1",
"Cluster\n2", "Cluster\n3"))
HM <- Heatmap(dds_vst.mat, name = "Variance-stabilized Accessibility",
row_names_gp = gpar(fontsize = 7), split = split, cluster_row_slices = TRUE,
border = TRUE, cluster_rows = TRUE, cluster_columns = FALSE,
show_row_names = FALSE, column_names_rot = 0, show_parent_dend_line = FALSE,
row_dend_width = unit(20, "mm"), heatmap_legend_param = list(direction = "horizontal"))
draw(HM, heatmap_legend_side = "bottom")
Setting up
high <- res.granges.annot$geneId[clust$cluster == 1]
high <- high[-c(1:7)] # remove apicoplast
medium <- res.granges.annot$geneId[clust$cluster == 2]
low <- res.granges.annot$geneId[clust$cluster == 3]
## build GO database read GO annotation file
GAF <- readGAF(filename = "/Volumes/GoogleDrive/My Drive/Greally_Lab/Toxoplasma/Genomes/ME49/ToxoDB-53_TgondiiME49_GO.gaf")
## Loading required package: GO.db
## Loading required package: RSQLite
## Loading required package: DBI
## Warning: RSQLite: Passing numeric values to row.names is deprecated. Pass a
## logical or a column name.
## Warning in result_fetch(res@ptr, n = n): SQL statements must be issued with
## dbExecute() or dbSendStatement() instead of dbGetQuery() or dbSendQuery().
## Warning in result_fetch(res@ptr, n = n): SQL statements must be issued with
## dbExecute() or dbSendStatement() instead of dbGetQuery() or dbSendQuery().
# extract relevant info. unfortunately could not find
# accessor functions for all required info, thus sometimes
# had to utilize object slots directly (with @)
mapping.index <- GAF@itemName2ItemIndex
ID.annotations <- itemAnnotations(GAF)
GO.sets <- GAF@sets
GO.annotation <- setAnnotations(GAF)
# create a 2-column data frame with GOID and ID index after
# little further processing, this will be used as input for
# clusterProfiler
GO.df <- data.frame(GOID = rep(names(GO.sets), sapply(GO.sets,
length)), ID.index = unlist(GO.sets), row.names = NULL)
# do some processing for objects GO.annotation and GO.df in
# both remove category 'all', and to GO.df also add column
# with Uniprot ids
# GO.annotation
GO.annotation <- GO.annotation[GO.annotation[, "term"] != "all",
]
GO.annotation[, "GOID"] <- rownames(GO.annotation)
# GO.df
GO.df <- GO.df[GO.df[, "GOID"] != "all", ]
GO.df[, "GeneID"] <- names(mapping.index[GO.df[, "ID.index"]])
Compare clusters
cluster.list <- list(High = high, Medium = medium, Low = low)
ck <- compareCluster(geneCluster = cluster.list, fun = "enricher",
TERM2GENE = GO.df[, c("GOID", "GeneID")], TERM2NAME = GO.annotation[,
c("GOID", "term")])
dotplot(ck, showCategory = 20)
write.csv(ck@compareClusterResult, paste(dir, "GO_enrichment.csv",
sep = ""))